Presented by Group 17:
Date: 06.01.2018
Due date: Sunday, 07. Jan. 2018, 22:00
Please hand in a copy of the solved notebook and a html-export of it.
Remark: To keep the file size small, the notebook contains png images of the solutions you should obtain. Double-click on the image cells to see the markdown code — do not execute these cells, as you do not have the images.
# Data handling and modeling.
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# Bokeh plotting stack: glyph models, annotations, palettes, layout helpers.
from bokeh.models import ColumnDataSource, ColorBar, LinearColorMapper, CategoricalColorMapper
from bokeh.models import Arrow, NormalHead, LabelSet, Label
from bokeh.plotting import figure, output_notebook, show
from bokeh.palettes import Category10, Category20, Viridis
from bokeh.transform import factor_cmap, linear_cmap
from bokeh.io import export_png
from bokeh.layouts import gridplot, row
from bokeh.core.properties import value
# Render bokeh plots inline in the notebook.
output_notebook()
# Load the 93 cars dataset; '*' marks missing values in the file.
cars = pd.read_csv( '93cars.dat.csv', sep='\s+', na_values='*')

# Impute missing entries in each affected column with that column's median.
for imputed_col in ['LuggageCapacity', 'Cylinders', 'RearSeatRoom']:
    cars[imputed_col] = cars[imputed_col].fillna(cars[imputed_col].median())

# Display the available column names.
cars.columns
**Task 1a)** Understand your data.
# Quantitative variables used for the full PCA.
# Fix: the original line read `var.var = [...]`, which raises a NameError;
# later cells (standardization, scree plot, biplot labels) reference the
# plain name `var`.
var = ['MinPrice', 'MidPrice', 'MaxPrice', 'CityMpg', 'HighwayMpg',
       'Cylinders', 'Engine', 'Horsepower', 'RPM', 'EngineRev', 'Tank',
       'Passenger', 'Length', 'Wheelbase', 'Width', 'UTurn',
       'RearSeatRoom', 'LuggageCapacity', 'Weight']
# Reduced variable subset for the grouped PCA.
vars_group1 = ['MidPrice', 'Horsepower', 'CityMpg', 'Weight']
# Arrow length and label offset for the biplot loading vectors.
scale_factor, position_factor = 7.0, 8.0
Which variables can be used to divide the cars into groups? How many groups do you obtain and what size do they have? Do you expect differences between the groups and if yes which?
**Task 1b)** Compute the PCA.
scikit-learn PCA: http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
# store standardized data (zero mean, unit variance per column) in cars_std
cars_std = StandardScaler().fit_transform(cars[var].iloc[0:,:])
# same standardization for the reduced variable subset
cars_group = StandardScaler().fit_transform(cars[vars_group1].iloc[0:,:])
# store the PCA of all quantitative variables in variable pca
pca = PCA(n_components=cars_std.shape[1])
pca.fit(cars_std)
# store the PCA of the grouped variable subset in variable pca_group
pca_group = PCA(n_components=cars_group.shape[1])
pca_group.fit(cars_group)
Important variables for further analysis (see documentation linked above):
# Loadings: one row per principal component, one column per original variable.
pca.components_
# Variance captured by each component, absolute ...
pca.explained_variance_
# ... and as a fraction of the total variance.
pca.explained_variance_ratio_
**Task 1c)** How many principal components to use.
# Scree plot: per-component and cumulative explained variance in percent.
explained_pct = pca.explained_variance_ratio_ * 100
cumulative_pct = explained_pct.cumsum()
pc_labels = ['PC%s' % (i + 1) for i in range(len(var))]

scree_source = ColumnDataSource(
    dict(x=pc_labels, var_exp=explained_pct, cum_var_exp=cumulative_pct))

p = figure(plot_width=520, plot_height=400, toolbar_location=None,
           x_range=pc_labels, y_range=(-2, 105),
           title="Cumulative Variance for PC Axis in Percentage")
# Bars: variance explained by each individual component.
p.vbar(source=scree_source, x='x', top='var_exp', width=0.9, bottom=0,
       legend='Explained variance')
# Orange markers and line: running total of explained variance.
p.circle(pc_labels, cumulative_pct, color='orange', size=5,
         legend="Cumulative explained variance")
p.line(pc_labels, cumulative_pct, color='orange', line_width=2)

p.legend.location = (235, 155)
p.legend.border_line_color = None
p.xgrid.visible = False
p.xaxis.axis_label = "Principal Component"
p.yaxis.axis_label = "Explained variance in percent"
show(p)
Here is a sample image to check your routine:

**Task 1d)** Interpret the axis.
The following chart presents a projection of the cars data onto the first two principal components. Use techniques you learned in class to derive the meaning of the axes.
# Project the standardized car data onto the principal components.
pca_auto = pd.DataFrame(pca.transform(cars_std),
                        columns=['PC%i' % (i + 1) for i in range(pca.n_components_)])
# Color points by car type; string labels are required by the categorical colormap.
pca_auto['label'] = cars.Type
pca_auto['label'] = pca_auto['label'].astype('str')
factors = sorted(pca_auto.label.unique())
source = ColumnDataSource(pca_auto)

# Fix: corrected title typo "Quantitavie" -> "Quantitative".
p = figure(plot_width=600, plot_height=600, y_range=(-4.5, 4.8),
           title="PCA Biplot with all Quantitative Variables")
p.circle(source=source, x='PC1', y='PC2', size=9, legend='label',
         color=factor_cmap('label', palette=Category10[10], factors=factors))

# Overlay the loading vectors: one arrow plus variable-name label per
# original variable (rows of components_.T are per-variable loadings).
feature_vectors = pca.components_.T
for i, v in enumerate(feature_vectors):
    p.add_layout(Arrow(end=NormalHead(fill_color="orange", size=10, line_width=2),
                       x_start=0, y_start=0,
                       x_end=scale_factor * v[0], y_end=scale_factor * v[1]))
    p.add_layout(Label(x=position_factor * v[0], y=position_factor * v[1],
                       text=var[i], render_mode='canvas',
                       background_fill_color='white', text_font_size='10pt'))

p.xaxis.axis_label = 'PC1'
p.yaxis.axis_label = 'PC2'
p.legend.location = "bottom_right"
show(p)
Explain the PC1 and PC2 axis. What is typical for cars on the left vs. right (x-axis)? What is typical for cars located towards the top vs. bottom of the chart (y-axis)?
# Same biplot as above, but for the PCA fitted on the grouped variable subset.
group_scores = pca_group.transform(cars_group)
pca_auto = pd.DataFrame(group_scores,
                        columns=['PC%i' % (i + 1) for i in range(pca_group.n_components_)])
# String car-type labels drive the categorical color mapping.
pca_auto['label'] = cars.Type.astype('str')
factors = sorted(pca_auto.label.unique())
source = ColumnDataSource(pca_auto)

p = figure(plot_width=600, plot_height=600, y_range=(-8, 8),
           title="PCA Biplot with Grouped Variables")
p.circle(source=source, x='PC1', y='PC2', size=9, legend='label',
         color=factor_cmap('label', palette=Category10[10], factors=factors))

# One loading arrow and caption per grouped variable.
for idx, vec in enumerate(pca_group.components_.T):
    loading_arrow = Arrow(end=NormalHead(fill_color="orange", size=10, line_width=2),
                          x_start=0, y_start=0,
                          x_end=scale_factor * vec[0], y_end=scale_factor * vec[1])
    caption = Label(x=position_factor * vec[0], y=position_factor * vec[1],
                    text=vars_group1[idx], render_mode='canvas',
                    background_fill_color='white', text_font_size='10pt')
    p.add_layout(loading_arrow)
    p.add_layout(caption)

p.xaxis.axis_label = 'PC1'
p.yaxis.axis_label = 'PC2'
p.legend.location = "bottom_right"
show(p)
You should get the following chart:

The following code reads the tumor data and does some preprocessing. You have access to the following variables:
# read the gene expression data, transpose it (samples become rows), and name the columns
df = pd.read_csv('nci.csv', header=None, sep='\s+')
df = df.T
df.columns = ['G%i' % i for i in range(len(df.columns))]
ngenes = len(df.columns)

# read the labels and add them to the dataframe
labels = pd.read_csv( 'nci-labels.csv' )
# NOTE(review): labels is a one-column DataFrame here; pandas aligns it by
# index when assigned to a column — verify nci-labels.csv has the expected
# header row so no label is consumed as the header.
df['cancer'] = labels

# remove synthetic tumors, keeping only real cancer types
df = df[df['cancer'].isin(['CNS', 'RENAL', 'BREAST', 'NSCLC', 'UNKNOWN', 'OVARIAN', 'MELANOMA',
                           'PROSTATE', 'LEUKEMIA', 'COLON'])]
# reset_index() keeps the old row index as an extra first column named 'index'
df = df.reset_index()
labels = df.cancer

# get a standardized version of the data (iloc skips the 'index' and 'cancer' columns)
df_std = StandardScaler().fit_transform(df.iloc[:,1:-1])

# get the occurring cancer types
cancer_types = sorted(labels.unique())
# Fix: `print cancer_types` is Python-2-only syntax and a SyntaxError on
# Python 3; the function form works on both.
print(cancer_types)
Helper routines
def getLegend( df ):
    """Build a dummy scatter plot that shows only the legend.

    The y_range (100, 101) lies far outside the plotted data, so no glyphs
    are visible and only the legend entries (one per cancer type) remain.
    Relies on the module-level ``cancer_types`` list for the color mapping.

    Fix: the notebook export lost the function-body indentation; restored here.
    """
    # Sort so legend entries appear in alphabetical order.
    df = df.sort_values(by=['cancer'])
    source = ColumnDataSource(df)
    l = figure( plot_width=200, plot_height=300, y_range=(100,101) )
    l.circle( source=source, x='G0', y='G1', size=7, legend='cancer',
              color=factor_cmap('cancer', palette=Category10[10], factors=cancer_types))
    # Hide everything except the legend itself.
    l.xaxis.visible = False
    l.yaxis.visible = False
    l.xgrid.visible = False
    l.ygrid.visible = False
    l.legend.location = "top_left"
    l.legend.border_line_color = None
    return l
df.cancer.value_counts()
**Task 2a)** Make a feasibility check.
# Collect the six random scatter plots plus the legend panel.
# Fix: renamed from `row`, which shadowed bokeh.layouts.row imported above.
plots = []

def getRandomGenePlot( df ):
    """Scatter two randomly chosen gene columns against each other,
    colored by cancer type."""
    p = figure( plot_width=300, plot_height=300 )
    # Pick two distinct gene indices. Fixes two issues in the original:
    # - `len(df.columns)-1` over-counts: the frame also holds the 'index'
    #   and 'cancer' columns, so index `ngenes` could be drawn although no
    #   column 'G<ngenes>' exists (genes run G0..G<ngenes-1>);
    # - without replace=False the same gene could be chosen twice.
    ids = np.random.choice( ngenes, 2, replace=False )
    var1 = 'G'+str(ids[0])
    var2 = 'G'+str(ids[1])
    p.circle( source=df, x=var1, y=var2, size=7,
              color=factor_cmap('cancer', palette=Category10[10], factors=cancer_types))
    p.xaxis.axis_label = var1
    p.yaxis.axis_label = var2
    return p

for i in range(6):
    plots.append(getRandomGenePlot(df))
plots.append(getLegend(df))

# 3x3 grid: six gene plots, legend panel in the bottom-right cell.
show( gridplot([[plots[0], plots[1], plots[2]],
                [plots[3], plots[4], plots[5]],
                [None, None, plots[6]]]))